Importing Libraries¶
In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
Importing Datasets¶
In [23]:
# Load the dataset
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df.head()
Out[23]:
| Category | pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.nprocs64bit | pslist.avg_handlers | dlllist.ndlls | dlllist.avg_dlls_per_proc | handles.nhandles | handles.avg_handles_per_proc | ... | svcscan.kernel_drivers | svcscan.fs_drivers | svcscan.process_services | svcscan.shared_process_services | svcscan.interactive_process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Benign | 45 | 17 | 10.555556 | 0 | 202.844444 | 1694 | 38.500000 | 9129 | 212.302326 | ... | 221 | 26 | 24 | 116 | 0 | 121 | 87 | 0 | 8 | Benign |
| 1 | Benign | 47 | 19 | 11.531915 | 0 | 242.234043 | 2074 | 44.127660 | 11385 | 242.234043 | ... | 222 | 26 | 24 | 118 | 0 | 122 | 87 | 0 | 8 | Benign |
| 2 | Benign | 40 | 14 | 14.725000 | 0 | 288.225000 | 1932 | 48.300000 | 11529 | 288.225000 | ... | 222 | 26 | 27 | 118 | 0 | 120 | 88 | 0 | 8 | Benign |
| 3 | Benign | 32 | 13 | 13.500000 | 0 | 264.281250 | 1445 | 45.156250 | 8457 | 264.281250 | ... | 222 | 26 | 27 | 118 | 0 | 120 | 88 | 0 | 8 | Benign |
| 4 | Benign | 42 | 16 | 11.452381 | 0 | 281.333333 | 2067 | 49.214286 | 11816 | 281.333333 | ... | 222 | 26 | 24 | 118 | 0 | 124 | 87 | 0 | 8 | Benign |
5 rows × 57 columns
Data Description¶
In [24]:
# Display the columns
print(df.columns)
# Show summary statistics
print(df.describe())
Index(['Category', 'pslist.nproc', 'pslist.nppid', 'pslist.avg_threads',
'pslist.nprocs64bit', 'pslist.avg_handlers', 'dlllist.ndlls',
'dlllist.avg_dlls_per_proc', 'handles.nhandles',
'handles.avg_handles_per_proc', 'handles.nport', 'handles.nfile',
'handles.nevent', 'handles.ndesktop', 'handles.nkey', 'handles.nthread',
'handles.ndirectory', 'handles.nsemaphore', 'handles.ntimer',
'handles.nsection', 'handles.nmutant', 'ldrmodules.not_in_load',
'ldrmodules.not_in_init', 'ldrmodules.not_in_mem',
'ldrmodules.not_in_load_avg', 'ldrmodules.not_in_init_avg',
'ldrmodules.not_in_mem_avg', 'malfind.ninjections',
'malfind.commitCharge', 'malfind.protection',
'malfind.uniqueInjections', 'psxview.not_in_pslist',
'psxview.not_in_eprocess_pool', 'psxview.not_in_ethread_pool',
'psxview.not_in_pspcid_list', 'psxview.not_in_csrss_handles',
'psxview.not_in_session', 'psxview.not_in_deskthrd',
'psxview.not_in_pslist_false_avg',
'psxview.not_in_eprocess_pool_false_avg',
'psxview.not_in_ethread_pool_false_avg',
'psxview.not_in_pspcid_list_false_avg',
'psxview.not_in_csrss_handles_false_avg',
'psxview.not_in_session_false_avg', 'psxview.not_in_deskthrd_false_avg',
'modules.nmodules', 'svcscan.nservices', 'svcscan.kernel_drivers',
'svcscan.fs_drivers', 'svcscan.process_services',
'svcscan.shared_process_services',
'svcscan.interactive_process_services', 'svcscan.nactive',
'callbacks.ncallbacks', 'callbacks.nanonymous', 'callbacks.ngeneric',
'Class'],
dtype='object')
pslist.nproc pslist.nppid pslist.avg_threads pslist.nprocs64bit \
count 58596.000000 58596.000000 58596.000000 58596.0
mean 41.394771 14.713837 11.341655 0.0
std 5.777249 2.656748 1.588231 0.0
min 21.000000 8.000000 1.650000 0.0
25% 40.000000 12.000000 9.972973 0.0
50% 41.000000 15.000000 11.000000 0.0
75% 43.000000 16.000000 12.861955 0.0
max 240.000000 72.000000 16.818182 0.0
pslist.avg_handlers dlllist.ndlls dlllist.avg_dlls_per_proc \
count 58596.000000 58596.000000 58596.000000
mean 247.509819 1810.805447 43.707806
std 111.857790 329.782639 5.742023
min 34.962500 670.000000 7.333333
25% 208.725000 1556.000000 38.833333
50% 243.963710 1735.000000 42.781524
75% 289.974322 2087.000000 49.605280
max 24845.951220 3443.000000 53.170732
handles.nhandles handles.avg_handles_per_proc handles.nport ... \
count 5.859600e+04 58596.000000 58596.0 ...
mean 1.025858e+04 249.560958 0.0 ...
std 4.866864e+03 145.999866 0.0 ...
min 3.514000e+03 71.139241 0.0 ...
25% 8.393000e+03 209.648228 0.0 ...
50% 9.287500e+03 247.208951 0.0 ...
75% 1.219300e+04 291.355050 0.0 ...
max 1.047310e+06 33784.193550 0.0 ...
svcscan.nservices svcscan.kernel_drivers svcscan.fs_drivers \
count 58596.000000 58596.000000 58596.000000
mean 391.347549 221.406581 25.996245
std 4.529704 1.991087 0.170790
min 94.000000 55.000000 6.000000
25% 389.000000 221.000000 26.000000
50% 389.000000 221.000000 26.000000
75% 395.000000 222.000000 26.000000
max 395.000000 222.000000 26.000000
svcscan.process_services svcscan.shared_process_services \
count 58596.000000 58596.000000
mean 25.063417 116.879514
std 1.529628 1.550401
min 7.000000 26.000000
25% 24.000000 116.000000
50% 24.000000 116.000000
75% 27.000000 118.000000
max 27.000000 118.000000
svcscan.interactive_process_services svcscan.nactive \
count 58596.0 58596.000000
mean 0.0 121.995546
std 0.0 2.822858
min 0.0 30.000000
25% 0.0 121.000000
50% 0.0 122.000000
75% 0.0 123.000000
max 0.0 129.000000
callbacks.ncallbacks callbacks.nanonymous callbacks.ngeneric
count 58596.000000 58596.000000 58596.000000
mean 86.905659 0.000853 7.999881
std 3.134117 0.029199 0.010929
min 50.000000 0.000000 7.000000
25% 87.000000 0.000000 8.000000
50% 87.000000 0.000000 8.000000
75% 88.000000 0.000000 8.000000
max 89.000000 1.000000 8.000000
[8 rows x 55 columns]
Density-KDE Plot¶
In [25]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Scatter plot or Density-KDE plot
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='pslist.nproc', y='pslist.avg_threads')
plt.xlabel('Number of Processes')
plt.ylabel('Average Threads')
plt.title('Scatter Plot: Number of Processes vs Average Threads')
plt.show()
In [8]:
Data Preprocessing¶
In [26]:
df.isna()
Out[26]:
| Category | pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.nprocs64bit | pslist.avg_handlers | dlllist.ndlls | dlllist.avg_dlls_per_proc | handles.nhandles | handles.avg_handles_per_proc | ... | svcscan.kernel_drivers | svcscan.fs_drivers | svcscan.process_services | svcscan.shared_process_services | svcscan.interactive_process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 3 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 58591 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 58592 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 58593 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 58594 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 58595 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
58596 rows × 57 columns
In [27]:
import pandas as pd
#Removing NaN values
df.dropna(inplace=True)
df.head()
Out[27]:
| Category | pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.nprocs64bit | pslist.avg_handlers | dlllist.ndlls | dlllist.avg_dlls_per_proc | handles.nhandles | handles.avg_handles_per_proc | ... | svcscan.kernel_drivers | svcscan.fs_drivers | svcscan.process_services | svcscan.shared_process_services | svcscan.interactive_process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Benign | 45 | 17 | 10.555556 | 0 | 202.844444 | 1694 | 38.500000 | 9129 | 212.302326 | ... | 221 | 26 | 24 | 116 | 0 | 121 | 87 | 0 | 8 | Benign |
| 1 | Benign | 47 | 19 | 11.531915 | 0 | 242.234043 | 2074 | 44.127660 | 11385 | 242.234043 | ... | 222 | 26 | 24 | 118 | 0 | 122 | 87 | 0 | 8 | Benign |
| 2 | Benign | 40 | 14 | 14.725000 | 0 | 288.225000 | 1932 | 48.300000 | 11529 | 288.225000 | ... | 222 | 26 | 27 | 118 | 0 | 120 | 88 | 0 | 8 | Benign |
| 3 | Benign | 32 | 13 | 13.500000 | 0 | 264.281250 | 1445 | 45.156250 | 8457 | 264.281250 | ... | 222 | 26 | 27 | 118 | 0 | 120 | 88 | 0 | 8 | Benign |
| 4 | Benign | 42 | 16 | 11.452381 | 0 | 281.333333 | 2067 | 49.214286 | 11816 | 281.333333 | ... | 222 | 26 | 24 | 118 | 0 | 124 | 87 | 0 | 8 | Benign |
5 rows × 57 columns
In [28]:
df.fillna(df.mean())
<ipython-input-28-a2478f315f9e>:1: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. df.fillna(df.mean())
Out[28]:
| Category | pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.nprocs64bit | pslist.avg_handlers | dlllist.ndlls | dlllist.avg_dlls_per_proc | handles.nhandles | handles.avg_handles_per_proc | ... | svcscan.kernel_drivers | svcscan.fs_drivers | svcscan.process_services | svcscan.shared_process_services | svcscan.interactive_process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Benign | 45 | 17 | 10.555556 | 0 | 202.844444 | 1694 | 38.500000 | 9129 | 212.302326 | ... | 221 | 26 | 24 | 116 | 0 | 121 | 87 | 0 | 8 | Benign |
| 1 | Benign | 47 | 19 | 11.531915 | 0 | 242.234043 | 2074 | 44.127660 | 11385 | 242.234043 | ... | 222 | 26 | 24 | 118 | 0 | 122 | 87 | 0 | 8 | Benign |
| 2 | Benign | 40 | 14 | 14.725000 | 0 | 288.225000 | 1932 | 48.300000 | 11529 | 288.225000 | ... | 222 | 26 | 27 | 118 | 0 | 120 | 88 | 0 | 8 | Benign |
| 3 | Benign | 32 | 13 | 13.500000 | 0 | 264.281250 | 1445 | 45.156250 | 8457 | 264.281250 | ... | 222 | 26 | 27 | 118 | 0 | 120 | 88 | 0 | 8 | Benign |
| 4 | Benign | 42 | 16 | 11.452381 | 0 | 281.333333 | 2067 | 49.214286 | 11816 | 281.333333 | ... | 222 | 26 | 24 | 118 | 0 | 124 | 87 | 0 | 8 | Benign |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 58591 | Ransomware-Shade-fa03be3078d1b9840f06745f160eb... | 37 | 15 | 10.108108 | 0 | 215.486487 | 1453 | 39.270270 | 7973 | 215.486487 | ... | 221 | 26 | 24 | 116 | 0 | 120 | 86 | 0 | 8 | Malware |
| 58592 | Ransomware-Shade-f56687137caf9a67678cde91e4614... | 37 | 14 | 9.945946 | 0 | 190.216216 | 1347 | 36.405405 | 7038 | 190.216216 | ... | 221 | 26 | 24 | 116 | 0 | 116 | 88 | 0 | 8 | Malware |
| 58593 | Ransomware-Shade-faddeea111a25da4d0888f3044ae9... | 38 | 15 | 9.842105 | 0 | 210.026316 | 1448 | 38.105263 | 7982 | 215.729730 | ... | 221 | 26 | 24 | 116 | 0 | 120 | 88 | 0 | 8 | Malware |
| 58594 | Ransomware-Shade-f866c086af2e1d8ebaa6f2c863157... | 37 | 15 | 10.243243 | 0 | 215.513513 | 1452 | 39.243243 | 7974 | 215.513513 | ... | 221 | 26 | 24 | 116 | 0 | 120 | 87 | 0 | 8 | Malware |
| 58595 | Ransomware-Shade-955d9af38346c1755527bd196668e... | 38 | 15 | 9.868421 | 0 | 213.026316 | 1487 | 39.131579 | 8095 | 213.026316 | ... | 221 | 26 | 24 | 116 | 0 | 120 | 86 | 0 | 8 | Malware |
58596 rows × 57 columns
In [30]:
from sklearn.preprocessing import LabelEncoder
# Create an instance of LabelEncoder
label_encoder = LabelEncoder()
# Convert categorical values to float values
df['Category'] = label_encoder.fit_transform(df['Category'])
df['Class'] = label_encoder.fit_transform(df['Class'])
# Print the transformed DataFrame
print(df)
Category pslist.nproc pslist.nppid pslist.avg_threads \
0 0 45 17 10.555556
1 0 47 19 11.531915
2 0 40 14 14.725000
3 0 32 13 13.500000
4 0 42 16 11.452381
... ... ... ... ...
58591 9362 37 15 10.108108
58592 9282 37 14 9.945946
58593 9411 38 15 9.842105
58594 9325 37 15 10.243243
58595 9042 38 15 9.868421
pslist.nprocs64bit pslist.avg_handlers dlllist.ndlls \
0 0 202.844444 1694
1 0 242.234043 2074
2 0 288.225000 1932
3 0 264.281250 1445
4 0 281.333333 2067
... ... ... ...
58591 0 215.486487 1453
58592 0 190.216216 1347
58593 0 210.026316 1448
58594 0 215.513513 1452
58595 0 213.026316 1487
dlllist.avg_dlls_per_proc handles.nhandles \
0 38.500000 9129
1 44.127660 11385
2 48.300000 11529
3 45.156250 8457
4 49.214286 11816
... ... ...
58591 39.270270 7973
58592 36.405405 7038
58593 38.105263 7982
58594 39.243243 7974
58595 39.131579 8095
handles.avg_handles_per_proc ... svcscan.kernel_drivers \
0 212.302326 ... 221
1 242.234043 ... 222
2 288.225000 ... 222
3 264.281250 ... 222
4 281.333333 ... 222
... ... ... ...
58591 215.486487 ... 221
58592 190.216216 ... 221
58593 215.729730 ... 221
58594 215.513513 ... 221
58595 213.026316 ... 221
svcscan.fs_drivers svcscan.process_services \
0 26 24
1 26 24
2 26 27
3 26 27
4 26 24
... ... ...
58591 26 24
58592 26 24
58593 26 24
58594 26 24
58595 26 24
svcscan.shared_process_services svcscan.interactive_process_services \
0 116 0
1 118 0
2 118 0
3 118 0
4 118 0
... ... ...
58591 116 0
58592 116 0
58593 116 0
58594 116 0
58595 116 0
svcscan.nactive callbacks.ncallbacks callbacks.nanonymous \
0 121 87 0
1 122 87 0
2 120 88 0
3 120 88 0
4 124 87 0
... ... ... ...
58591 120 86 0
58592 116 88 0
58593 120 88 0
58594 120 87 0
58595 120 86 0
callbacks.ngeneric Class
0 8 0
1 8 0
2 8 0
3 8 0
4 8 0
... ... ...
58591 8 1
58592 8 1
58593 8 1
58594 8 1
58595 8 1
[58596 rows x 57 columns]
In [31]:
Out[31]:
| Category | pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.nprocs64bit | pslist.avg_handlers | dlllist.ndlls | dlllist.avg_dlls_per_proc | handles.nhandles | handles.avg_handles_per_proc | ... | svcscan.kernel_drivers | svcscan.fs_drivers | svcscan.process_services | svcscan.shared_process_services | svcscan.interactive_process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 45 | 17 | 10.555556 | 0 | 202.844444 | 1694 | 38.500000 | 9129 | 212.302326 | ... | 221 | 26 | 24 | 116 | 0 | 121 | 87 | 0 | 8 | 0 |
| 1 | 0 | 47 | 19 | 11.531915 | 0 | 242.234043 | 2074 | 44.127660 | 11385 | 242.234043 | ... | 222 | 26 | 24 | 118 | 0 | 122 | 87 | 0 | 8 | 0 |
| 2 | 0 | 40 | 14 | 14.725000 | 0 | 288.225000 | 1932 | 48.300000 | 11529 | 288.225000 | ... | 222 | 26 | 27 | 118 | 0 | 120 | 88 | 0 | 8 | 0 |
| 3 | 0 | 32 | 13 | 13.500000 | 0 | 264.281250 | 1445 | 45.156250 | 8457 | 264.281250 | ... | 222 | 26 | 27 | 118 | 0 | 120 | 88 | 0 | 8 | 0 |
| 4 | 0 | 42 | 16 | 11.452381 | 0 | 281.333333 | 2067 | 49.214286 | 11816 | 281.333333 | ... | 222 | 26 | 24 | 118 | 0 | 124 | 87 | 0 | 8 | 0 |
5 rows × 57 columns
In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
# Load the dataset
df = pd.read_csv('Obfuscated-MalMem2022.csv')
df['Class'] = df['Class'].astype('category').cat.codes
# Diagnostic Analytics for Text Data (Word Clouds)
text_columns = ['Category']
for column in text_columns:
if df[column].notnull().any(): # Check if column contains at least one non-null value
plt.figure(figsize=(8, 6))
wordcloud = WordCloud(background_color='white').generate(' '.join(df[column].astype(str)))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title(f'Word Cloud - {column}')
plt.axis('off')
plt.show()
else:
print(f"No data available for word cloud - {column}")
# Diagnostic Analytics for Numerical Data (Correlation Plot)
plt.figure(figsize=(12, 10))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap')
plt.show()
# Diagnostic Analytics for Numerical Data (Scatterplot with Trendline)
numerical_columns = ['pslist.nproc', 'pslist.avg_threads', 'handles.nhandles']
for column in numerical_columns:
plt.figure(figsize=(8, 6))
sns.regplot(x=column, y='Class', data=df, scatter_kws={'alpha': 0.5})
plt.title(f'Scatterplot with Trendline - {column}')
plt.xlabel(column)
plt.ylabel('Class')
plt.show()
<ipython-input-32-1d053cf7870a>:26: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. correlation_matrix = df.corr()
In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
df['Category'] = df['Category'].astype('category').cat.codes
# Split the dataset into features (X) and target variable (y)
X = df.drop('Class', axis=1)
y = df['Class']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train a Random Forest classifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)
# Predict the target variable for the test set
y_pred = classifier.predict(X_test)
# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
classification_report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{classification_report}")
Accuracy: 1.0
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 5790
1 1.00 1.00 1.00 5930
accuracy 1.00 11720
macro avg 1.00 1.00 1.00 11720
weighted avg 1.00 1.00 1.00 11720
In [ ]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Select only the numerical columns
numeric_cols = df.drop("Class", axis=1).select_dtypes(include=[float, int])
# Histogram of all columns
for col in numeric_cols.columns:
if col != "ldrmodules.not_in_mem_avg":
sns.histplot(df[col], kde=True)
plt.title(f"Histogram of {col}")
plt.xlabel(col)
plt.ylabel("Frequency")
plt.show()
# Pair plots of all columns
sns.pairplot(df, hue="Class", diag_kind="hist")
plt.show()
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-8-7f201c040420> in <cell line: 9>() 9 for col in numeric_cols.columns: 10 if col != "ldrmodules.not_in_mem_avg": ---> 11 sns.histplot(df[col], kde=True) 12 plt.title(f"Histogram of {col}") 13 plt.xlabel(col) /usr/local/lib/python3.10/dist-packages/seaborn/distributions.py in histplot(data, x, y, hue, weights, stat, bins, binwidth, binrange, discrete, cumulative, common_bins, common_norm, multiple, element, fill, shrink, kde, kde_kws, line_kws, thresh, pthresh, pmax, cbar, cbar_ax, cbar_kws, palette, hue_order, hue_norm, color, log_scale, legend, ax, **kwargs) 1430 if p.univariate: 1431 -> 1432 p.plot_univariate_histogram( 1433 multiple=multiple, 1434 element=element, /usr/local/lib/python3.10/dist-packages/seaborn/distributions.py in plot_univariate_histogram(self, multiple, element, fill, common_norm, common_bins, shrink, kde, kde_kws, color, legend, line_kws, estimate_kws, **plot_kws) 573 574 plot_func = ax.bar if self.data_variable == "x" else ax.barh --> 575 artists = plot_func( 576 hist["edges"], 577 hist["heights"] - bottom, /usr/local/lib/python3.10/dist-packages/matplotlib/__init__.py in inner(ax, data, *args, **kwargs) 1440 def inner(ax, *args, data=None, **kwargs): 1441 if data is None: -> 1442 return func(ax, *map(sanitize_sequence, args), **kwargs) 1443 1444 bound = new_sig.bind(ax, *args, **kwargs) /usr/local/lib/python3.10/dist-packages/matplotlib/axes/_axes.py in bar(self, x, height, width, bottom, align, **kwargs) 2492 else: # horizontal 2493 r.sticky_edges.x.append(l) -> 2494 self.add_patch(r) 2495 patches.append(r) 2496 /usr/local/lib/python3.10/dist-packages/matplotlib/axes/_base.py in add_patch(self, p) 2377 if p.get_clip_path() is None: 2378 p.set_clip_path(self.patch) -> 2379 self._update_patch_limits(p) 2380 self._children.append(p) 2381 p._remove_method = self._children.remove /usr/local/lib/python3.10/dist-packages/matplotlib/axes/_base.py in _update_patch_limits(self, patch) 2399 # Loop through each segment to get extrema for Bezier curve sections 2400 vertices = [] -> 2401 for curve, code in p.iter_bezier(simplify=False): 2402 # Get distance along the curve of any extrema 2403 _, dzeros = curve.axis_aligned_extrema() /usr/local/lib/python3.10/dist-packages/matplotlib/path.py in iter_bezier(self, **kwargs) 447 if code == Path.MOVETO: # a point is like "CURVE1" 448 first_vert = verts --> 449 yield BezierSegment(np.array([first_vert])), code 450 elif code == Path.LINETO: # "CURVE2" 451 yield BezierSegment(np.array([prev_vert, verts])), code /usr/local/lib/python3.10/dist-packages/matplotlib/bezier.py in __init__(self, control_points) 192 self._cpoints = np.asarray(control_points) 193 self._N, self._d = self._cpoints.shape --> 194 self._orders = np.arange(self._N) 195 coeff = [math.factorial(self._N - 1) 196 // (math.factorial(i) * math.factorial(self._N - 1 - i)) KeyboardInterrupt:
Error in callback <function _draw_all_if_interactive at 0x7fb2e3531000> (for post_execute):
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) /usr/local/lib/python3.10/dist-packages/matplotlib/pyplot.py in _draw_all_if_interactive() 118 def _draw_all_if_interactive(): 119 if matplotlib.is_interactive(): --> 120 draw_all() 121 122 /usr/local/lib/python3.10/dist-packages/matplotlib/_pylab_helpers.py in draw_all(cls, force) 130 for manager in cls.get_all_fig_managers(): 131 if force or manager.canvas.figure.stale: --> 132 manager.canvas.draw_idle() 133 134 /usr/local/lib/python3.10/dist-packages/matplotlib/backend_bases.py in draw_idle(self, *args, **kwargs) 2080 if not self._is_idle_drawing: 2081 with self._idle_draw_cntx(): -> 2082 self.draw(*args, **kwargs) 2083 2084 @property /usr/local/lib/python3.10/dist-packages/matplotlib/backends/backend_agg.py in draw(self) 398 (self.toolbar._wait_cursor_for_draw_cm() if self.toolbar 399 else nullcontext()): --> 400 self.figure.draw(self.renderer) 401 # A GUI class may be need to update a window using this draw, so 402 # don't forget to call the superclass. /usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in draw_wrapper(artist, renderer, *args, **kwargs) 93 @wraps(draw) 94 def draw_wrapper(artist, renderer, *args, **kwargs): ---> 95 result = draw(artist, renderer, *args, **kwargs) 96 if renderer._rasterizing: 97 renderer.stop_rasterizing() /usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in draw_wrapper(artist, renderer) 70 renderer.start_filter() 71 ---> 72 return draw(artist, renderer) 73 finally: 74 if artist.get_agg_filter() is not None: /usr/local/lib/python3.10/dist-packages/matplotlib/figure.py in draw(self, renderer) 3138 3139 self.patch.draw(renderer) -> 3140 mimage._draw_list_compositing_images( 3141 renderer, self, artists, self.suppressComposite) 3142 /usr/local/lib/python3.10/dist-packages/matplotlib/image.py in _draw_list_compositing_images(renderer, parent, artists, suppress_composite) 129 if not_composite or not has_images: 130 for a in artists: --> 131 a.draw(renderer) 132 else: 133 # Composite any adjacent images together /usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in draw_wrapper(artist, renderer) 70 renderer.start_filter() 71 ---> 72 return draw(artist, renderer) 73 finally: 74 if artist.get_agg_filter() is not None: /usr/local/lib/python3.10/dist-packages/matplotlib/axes/_base.py in draw(self, renderer) 3062 _draw_rasterized(self.figure, artists_rasterized, renderer) 3063 -> 3064 mimage._draw_list_compositing_images( 3065 renderer, self, artists, self.figure.suppressComposite) 3066 /usr/local/lib/python3.10/dist-packages/matplotlib/image.py in _draw_list_compositing_images(renderer, parent, artists, suppress_composite) 129 if not_composite or not has_images: 130 for a in artists: --> 131 a.draw(renderer) 132 else: 133 # Composite any adjacent images together /usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in draw_wrapper(artist, renderer) 70 renderer.start_filter() 71 ---> 72 return draw(artist, renderer) 73 finally: 74 if artist.get_agg_filter() is not None: /usr/local/lib/python3.10/dist-packages/matplotlib/patches.py in draw(self, renderer) 589 tpath = transform.transform_path_non_affine(path) 590 affine = transform.get_affine() --> 591 self._draw_paths_with_artist_properties( 592 renderer, 593 [(tpath, affine, /usr/local/lib/python3.10/dist-packages/matplotlib/patches.py in _draw_paths_with_artist_properties(self, renderer, draw_path_args_list) 543 544 renderer.open_group('patch', self.get_gid()) --> 545 gc = renderer.new_gc() 546 547 gc.set_foreground(self._edgecolor, isRGBA=True) /usr/local/lib/python3.10/dist-packages/matplotlib/backend_bases.py in new_gc(self) 683 def new_gc(self): 684 """Return an instance of a `.GraphicsContextBase`.""" --> 685 return GraphicsContextBase() 686 687 def points_to_pixels(self, points): /usr/local/lib/python3.10/dist-packages/matplotlib/backend_bases.py in __init__(self) 762 self._forced_alpha = False # if True, _alpha overrides A from RGBA 763 self._antialiased = 1 # use 0, 1 not True, False for extension code --> 764 self._capstyle = CapStyle('butt') 765 self._cliprect = None 766 self._clippath = None /usr/lib/python3.10/enum.py in __call__(cls, value, names, module, qualname, type, start) 357 return True 358 --> 359 def __call__(cls, value, names=None, *, module=None, qualname=None, type=None, start=1): 360 """ 361 Either returns an existing member, or creates a new enum class. KeyboardInterrupt:
Error in callback <function flush_figures at 0x7fb2e3530280> (for post_execute):
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) /usr/local/lib/python3.10/dist-packages/matplotlib_inline/backend_inline.py in flush_figures() 124 # ignore the tracking, just draw and close all figures 125 try: --> 126 return show(True) 127 except Exception as e: 128 # safely show traceback if in IPython, else raise /usr/local/lib/python3.10/dist-packages/matplotlib_inline/backend_inline.py in show(close, block) 88 try: 89 for figure_manager in Gcf.get_all_fig_managers(): ---> 90 display( 91 figure_manager.canvas.figure, 92 metadata=_fetch_figure_metadata(figure_manager.canvas.figure) /usr/local/lib/python3.10/dist-packages/IPython/core/display.py in display(include, exclude, metadata, transient, display_id, *objs, **kwargs) 318 publish_display_data(data=obj, metadata=metadata, **kwargs) 319 else: --> 320 format_dict, md_dict = format(obj, include=include, exclude=exclude) 321 if not format_dict: 322 # nothing to display (e.g. _ipython_display_ took over) /usr/local/lib/python3.10/dist-packages/IPython/core/formatters.py in format(self, obj, include, exclude) 178 md = None 179 try: --> 180 data = formatter(obj) 181 except: 182 # FIXME: log the exception <decorator-gen-2> in __call__(self, obj) /usr/local/lib/python3.10/dist-packages/IPython/core/formatters.py in catch_format_error(method, self, *args, **kwargs) 222 """show traceback on failed format call""" 223 try: --> 224 r = method(self, *args, **kwargs) 225 except NotImplementedError: 226 # don't warn on NotImplementedErrors /usr/local/lib/python3.10/dist-packages/IPython/core/formatters.py in __call__(self, obj) 339 pass 340 else: --> 341 return printer(obj) 342 # Finally look for special method names 343 method = get_real_method(obj, self.print_method) /usr/local/lib/python3.10/dist-packages/IPython/core/pylabtools.py in print_figure(fig, fmt, bbox_inches, base64, **kwargs) 149 FigureCanvasBase(fig) 150 --> 151 fig.canvas.print_figure(bytes_io, **kw) 152 data = bytes_io.getvalue() 153 if fmt == 'svg': /usr/local/lib/python3.10/dist-packages/matplotlib/backend_bases.py in print_figure(self, filename, dpi, facecolor, edgecolor, orientation, format, bbox_inches, pad_inches, bbox_extra_artists, backend, **kwargs) 2340 ) 2341 with getattr(renderer, "_draw_disabled", nullcontext)(): -> 2342 self.figure.draw(renderer) 2343 2344 if bbox_inches: /usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in draw_wrapper(artist, renderer, *args, **kwargs) 93 @wraps(draw) 94 def draw_wrapper(artist, renderer, *args, **kwargs): ---> 95 result = draw(artist, renderer, *args, **kwargs) 96 if renderer._rasterizing: 97 renderer.stop_rasterizing() /usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in draw_wrapper(artist, renderer) 70 renderer.start_filter() 71 ---> 72 return draw(artist, renderer) 73 finally: 74 if artist.get_agg_filter() is not None: /usr/local/lib/python3.10/dist-packages/matplotlib/figure.py in draw(self, renderer) 3138 3139 self.patch.draw(renderer) -> 3140 mimage._draw_list_compositing_images( 3141 renderer, self, artists, self.suppressComposite) 3142 /usr/local/lib/python3.10/dist-packages/matplotlib/image.py in _draw_list_compositing_images(renderer, parent, artists, suppress_composite) 129 if not_composite or not has_images: 130 for a in artists: --> 131 a.draw(renderer) 132 else: 133 # Composite any adjacent images together /usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in draw_wrapper(artist, renderer) 70 renderer.start_filter() 71 ---> 72 return draw(artist, renderer) 73 finally: 74 if artist.get_agg_filter() is not None: /usr/local/lib/python3.10/dist-packages/matplotlib/axes/_base.py in draw(self, renderer) 3062 _draw_rasterized(self.figure, artists_rasterized, renderer) 3063 -> 3064 mimage._draw_list_compositing_images( 3065 renderer, self, artists, self.figure.suppressComposite) 3066 /usr/local/lib/python3.10/dist-packages/matplotlib/image.py in _draw_list_compositing_images(renderer, parent, artists, suppress_composite) 129 if not_composite or not has_images: 130 for a in artists: --> 131 a.draw(renderer) 132 else: 133 # Composite any adjacent images together /usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in draw_wrapper(artist, renderer) 70 renderer.start_filter() 71 ---> 72 return draw(artist, renderer) 73 finally: 74 if artist.get_agg_filter() is not None: /usr/local/lib/python3.10/dist-packages/matplotlib/patches.py in draw(self, renderer) 589 tpath = transform.transform_path_non_affine(path) 590 affine = transform.get_affine() --> 591 self._draw_paths_with_artist_properties( 592 renderer, 593 [(tpath, affine, /usr/local/lib/python3.10/dist-packages/matplotlib/patches.py in _draw_paths_with_artist_properties(self, renderer, draw_path_args_list) 556 557 gc.set_antialiased(self._antialiased) --> 558 self._set_gc_clip(gc) 559 gc.set_url(self._url) 560 gc.set_snap(self.get_snap()) /usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in _set_gc_clip(self, gc) 932 if self.clipbox is not None: 933 gc.set_clip_rectangle(self.clipbox) --> 934 gc.set_clip_path(self._clippath) 935 else: 936 gc.set_clip_rectangle(None) /usr/local/lib/python3.10/dist-packages/matplotlib/backend_bases.py in set_clip_path(self, path) 928 def set_clip_path(self, path): 929 """Set the clip path to a `.TransformedPath` or None.""" --> 930 _api.check_isinstance((transforms.TransformedPath, None), path=path) 931 self._clippath = path 932 /usr/local/lib/python3.10/dist-packages/matplotlib/_api/__init__.py in check_isinstance(_types, **kwargs) 85 else f"{tp.__module__}.{tp.__qualname__}") 86 ---> 87 for k, v in kwargs.items(): 88 if not isinstance(v, types): 89 names = [*map(type_name, types)] KeyboardInterrupt:
In [34]:
# Visualize the distribution of each feature
sns.histplot(df["pslist.nproc"])
plt.show()
In [35]:
sns.histplot(df["callbacks.ngeneric"])
plt.show()
In [36]:
sns.histplot(df["pslist.nppid"])
plt.show()
In [37]:
sns.histplot(df["Class"])
plt.show()
In [38]:
sns.histplot(df["pslist.avg_threads"])
plt.show()
In [39]:
sns.histplot(df["dlllist.ndlls"])
plt.show()
In [40]:
sns.histplot(df["svcscan.shared_process_services"])
plt.show()
Top 5 Features Shown¶
In [41]:
# Split the dataset into features (X) and target variable (y)
X = df.drop("Class", axis=1)
y = df["Class"]
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train a random forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
# Get feature importances
importances = rf_classifier.feature_importances_
# Create a DataFrame to store feature importances
feature_importances = pd.DataFrame({"Feature": X.columns, "Importance": importances})
feature_importances.sort_values(by="Importance", ascending=False, inplace=True)
# Plot the top five features' importance
sns.barplot(x="Importance", y="Feature", data=feature_importances.head(5))
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("Top Five Features Importance")
plt.show()
In [42]:
# Split the dataset into features (X) and target variable (y)
X = df.drop("Class", axis=1)
y = df["Class"]
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train a random forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
# Predict on the testing set
y_pred = rf_classifier.predict(X_test)
# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
Accuracy: 1.0 Precision: 1.0 Recall: 1.0 F1-Score: 1.0
In [ ]:
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=True)
plt.title("Correlation Heatmap")
plt.show()
In [ ]:
from sklearn.metrics import classification_report
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
# Calculate feature importances
importances = rf_classifier.feature_importances_
# Sort feature importances in descending order
sorted_indices = np.argsort(importances)[::-1]
sorted_features = X.columns[sorted_indices]
# Plot feature importance
plt.figure(figsize=(10, 6))
plt.bar(sorted_features[:5], importances[sorted_indices][:5])
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("Top 5 Feature Importances")
plt.xticks(rotation=45)
plt.show()
# Step 5: Classifier Building
# Train a random forest classifier on the entire dataset
rf_classifier_final = RandomForestClassifier()
rf_classifier_final.fit(X, y)
# Evaluate the classifier's performance on the testing data
y_pred = rf_classifier_final.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 1.00 1.00 1.00 5790
1 1.00 1.00 1.00 5930
accuracy 1.00 11720
macro avg 1.00 1.00 1.00 11720
weighted avg 1.00 1.00 1.00 11720
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Step 3: Diagnostic Analysis
correlation_matrix = df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=True)
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()
# Step 4: Feature Importance
X = df.drop("Class", axis=1)
y = df["Class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
importances = rf_classifier.feature_importances_
sorted_indices = np.argsort(importances)[::-1]
sorted_features = X.columns[sorted_indices]
plt.figure(figsize=(10, 6))
plt.bar(sorted_features[:5], importances[sorted_indices][:5])
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("Top 5 Feature Importances")
plt.xticks(rotation=45)
plt.show()
# Step 5: Classifier Building
rf_classifier_final = RandomForestClassifier()
rf_classifier_final.fit(X, y)
y_pred = rf_classifier_final.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 1.00 1.00 1.00 5790
1 1.00 1.00 1.00 5930
accuracy 1.00 11720
macro avg 1.00 1.00 1.00 11720
weighted avg 1.00 1.00 1.00 11720
In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Compute correlation matrix
correlation_matrix = df.corr()
# Plot correlation matrix as a heatmap
plt.figure(figsize=(100, 80))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=True)
plt.title("Correlation Heatmap")
plt.show()
Output hidden; open in https://colab.research.google.com to view.
New section¶
In [44]:
!jupyter nbconvert --to html BigData.ipynb
[NbConvertApp] Converting notebook BigData.ipynb to html [NbConvertApp] Writing 8951276 bytes to BigData.html
In [ ]: